syntax = "proto3";

package tensorflow.profiler;

import "tensorflow/core/profiler/protobuf/diagnostics.proto";
import "tensorflow/core/profiler/protobuf/kernel_stats.proto";
import "tensorflow/core/profiler/protobuf/op_metrics.proto";
import "tensorflow/core/profiler/protobuf/power_metrics.proto";
import "tensorflow/core/profiler/protobuf/steps_db.proto";
import "tensorflow/core/profiler/protobuf/tf_function.proto";
import "tensorflow/core/profiler/protobuf/topology.proto";

// Performance environment, e.g., the peak performance capabilities of the
// device.
//
// Field numbers are not in declaration order. The numbers (not the names or
// the order) identify fields on the wire, so they must never be renumbered.
//
// NOTE(review): the comments below give bandwidths in GiB/s while the field
// names say "giga_bytes" -- confirm whether the unit is GB/s (10^9 bytes) or
// GiB/s (2^30 bytes).
message PerfEnv {
  // Peak performance of a TPU core or a GPU in TFLOP/s.
  double peak_tera_flops_per_second = 1;
  // Peak memory bandwidth of a TPU core or a GPU in GiB/s.
  double peak_bw_giga_bytes_per_second = 4;
  // Peak off-chip memory bandwidth of a TPU core or a GPU in GiB/s.
  double peak_hbm_bw_giga_bytes_per_second = 2;
  // Peak memory bandwidths of a TPU core or a GPU in GiB/s.
  // Index into this array using the MemBwType enum. MemBwType is not declared
  // in this file; presumably it comes from one of the imports -- TODO confirm.
  // TODO: remove the two fields above and bump up the proto version to
  // maintain backwards compatibility. When they are removed, reserve field
  // numbers 4 and 2 so that they are never reused.
  repeated double peak_bws_giga_bytes_per_second = 5;
  // The ridge point of the roofline model in FLOP/Byte (i.e., the minimum
  // operational intensity required to achieve maximum performance).
  double ridge_point = 3;
}

// Result proto for host-independent job information, i.e., information about
// the build and the profiling session that is not tied to a particular host.
//
// All fields are proto3 scalars without the `optional` label, so an unset
// field is indistinguishable from one explicitly set to 0 or "".
message HostIndependentJobInfoResult {
  // The change-list number of this build.
  int64 change_list = 1;
  // The time of this build (nanoseconds since the Unix epoch).
  int64 build_time = 2;
  // The target of this build.
  string build_target = 3;
  // Profiling duration in milliseconds.
  uint32 profile_duration_ms = 4;
}

// Result proto for host-dependent job information, i.e., information that is
// specific to one host on which the job ran.
//
// All fields are proto3 scalars without the `optional` label, so an unset
// field is indistinguishable from one explicitly set to 0 or "".
message HostDependentJobInfoResult {
  // The ID of the host on which the job was run.
  string host_id = 1;
  // The command line used to run the job.
  string command_line = 2;
  // The start time of this run (nanoseconds since the Unix epoch).
  int64 start_time = 3;
  // BNS address specified by the client at the time of the profiling request.
  string bns_address = 4;
  // Profiling start wall time in nanoseconds.
  // NOTE(review): the reference point is not stated here; presumably it is
  // the Unix epoch like start_time -- confirm.
  uint64 profile_time_ns = 5;
}

// System topology, which describes the number of chips in a pod
// and the connectivity style.
//
// The only use of this message visible in this file is
// RunEnvironment.topology, which is marked deprecated.
message SystemTopology {
  // The X, Y, and Z dimensions of this topology. 0 means that the dimension
  // does not exist.
  int64 x_dimension = 1;
  int64 y_dimension = 2;
  int64 z_dimension = 3;
  // The number of expected bad chips in this system.
  // NOTE(review): the field name says "reduced" while the comment says "bad";
  // confirm that the two terms mean the same thing here.
  int64 num_expected_reduced_chips = 4;
}

// Next ID: 16
// The run environment of a profiling session: how many hosts, tasks, and
// device cores were involved, how they were connected, and job-level
// information.
message RunEnvironment {
  // Number of hosts used.
  int32 host_count = 1;
  // Number of tasks used.
  int32 task_count = 2;
  // Distinct hostnames seen. The map key is the hostname.
  // NOTE(review): the map is presumably used as a set; the meaning of the
  // bool value is not documented here -- confirm against the writer.
  map<string, bool> hostnames = 3;
  // The type of device used.
  string device_type = 4;
  // The number of device cores used.
  //   In the TPU case, this corresponds to the number of TPU cores.
  //   In the GPU case, this corresponds to the number of GPUs (not the number
  //   of SMs).
  int32 device_core_count = 5;
  // Host-independent information about this job.
  HostIndependentJobInfoResult host_independent_job_info = 7;
  // Host-dependent information about this job; a repeated field, presumably
  // holding one entry per host -- TODO confirm.
  repeated HostDependentJobInfoResult host_dependent_job_info = 8;
  // The number of replicas, which corresponds to input parallelism.
  // If there is no model parallelism, replica_count = device_core_count.
  int32 replica_count = 9;
  // The number of cores used for a single replica, e.g., model parallelism.
  // If there is no model parallelism, then num_cores_per_replica = 1.
  int32 num_cores_per_replica = 10;
  // The chip interconnection topology.
  // Deprecated. NOTE(review): system_topology (field 13) appears to be the
  // replacement -- confirm before migrating readers.
  SystemTopology topology = 11 [deprecated = true];
  // Host trace level.
  uint32 host_trace_level = 12;
  // The chip and host interconnection topology.
  Topology system_topology = 13;
  // Whether it is a training analysis or inference analysis
  // (true = training, false = inference).
  bool is_training = 14;
  // Power Metrics for TPU.
  PowerMetrics power_metrics = 15;
  // Field number 6 is reserved so that it cannot be assigned to a new field.
  reserved 6;
}

// Next ID: 7
// Details identifying a single core: the host it belongs to and its IDs at
// host, chip, and mesh scope. Used as the value type of
// OpStats.core_id_to_details.
message CoreDetails {
  string hostname = 1;
  uint32 device_ordinal = 2;  // unique within host, TPU core only
  uint32 core_num = 3;        // unique within chip per core type
  uint32 local_chip_id = 4;   // unique within host
  uint32 global_chip_id = 5;  // unique within mesh
  uint32 global_core_id = 6;  // unique within mesh, TPU core only
}

// Metrics based on hardware performance counters.
message PerformanceCounterResult {
  // Overall matrix unit utilization as a percentage.
  // NOTE(review): presumably in the range [0, 100] rather than [0, 1] --
  // confirm against the producer.
  double matrix_unit_utilization_percent = 1;
}

// Operator statistics: the top-level result that bundles the op metrics, step
// information, run environment, and diagnostics of a profiling session.
// Next ID: 14
//
// Field numbers are not in declaration order. The numbers identify fields on
// the wire, so they must never be renumbered.
message OpStats {
  // The database for the op metrics collected from the host over the entire
  // profiling session including incomplete steps.
  OpMetricsDb host_op_metrics_db = 1;
  // The database for the op metrics collected from the device over the entire
  // profiling session including incomplete steps.
  OpMetricsDb device_op_metrics_db = 2;
  // The result for the HLO-metric database over the complete steps only.
  OpMetricsDb hlo_metrics_db_complete_steps_only = 10;
  // Performance environment of the op metrics collected.
  PerfEnv perf_env = 3;
  // The database of step sequences.
  StepDatabaseResult step_db = 4;
  // The run environment of this profiling session.
  RunEnvironment run_environment = 5;
  // Kernel stats results from all GPUs.
  KernelStatsDb kernel_stats_db = 6;
  // Statistics for all tf-functions.
  TfFunctionDb tf_function_db = 8;
  // A map from core ID to details. Map iteration order is undefined, so
  // readers must not rely on it.
  map<uint32, CoreDetails> core_id_to_details = 11;
  // Error and warning messages for diagnosing profiling issues.
  Diagnostics diagnostics = 9;
  // A map from program ID to program name.
  map<uint64, string> program_id_to_name_map = 12;
  // Performance counters.
  PerformanceCounterResult performance_counter_result = 13;
  // Field number 7 is reserved so that it cannot be assigned to a new field.
  reserved 7;
}
